knitr::opts_chunk$set(echo=TRUE)
knitr::opts_chunk$set(warning=FALSE, message=FALSE)
# Load libraries
# library(ggchromatic)
library(scater)
library(ggpubr)
library(igraph)
library(dittoSeq)
library(Rphenograph)
library(aricode)
library(viridis)
library(ggrepel)
library(LaplacesDemon)
library(DescTools)
library(RColorBrewer)
library(mclust)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"
# Set path to masks
tonsil.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152"
lung.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")
first, we look at U computed using different training datasets. For this, we read in the results from CISI training on the Tonsil th152 and the Immucan lung dataset (tonsil_vs_lung.py).
## Read results
# Read in all results for tonsil and lung comparison into one dataframe
files <- list.files(analysis.path, "simulation_results.txt",
full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
files <- files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", files)]
res <- lapply(files, read_results, type="res") %>%
bind_rows()
# Harmonize all dataset names
patterns <- unique(unlist(lapply(res$training, function(name){
if (length(str_split(name, "_")[[1]])==1){
name
}
})))
res$training <- unlist(lapply(res$training, function(pat){
patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
res$dataset <- unlist(lapply(res$dataset, function(pat){
patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
## Read U
u.files <- list.files(analysis.path, "gene_modules.csv",
full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
u.files <- u.files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", u.files)]
u <- lapply(u.files, read_U, type="training") %>%
bind_rows() %>%
dplyr::rename(dataset=rep)
## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
X.files <- X.files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", X.files)]
X.files <- X.files[!grepl("_processed", X.files)]
# Read processed SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# X.files <- X.files[grepl("_processed", X.files)]
# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, which k was used and if it
# is the ground truth or simulated X)
# Remove neg. values and transform counts?
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
pat <- metadata(sce.temp)
metadata(sce.temp)$dataset <- patterns[str_detect(pat$dataset,
fixed(patterns, ignore_case=TRUE))]
metadata(sce.temp)$training <- patterns[str_detect(pat$training,
fixed(patterns, ignore_case=TRUE))]
assay.name <- names(assays(sce.temp))
for (a in assay.name[-1]) {
assay(sce.temp, a) <- NULL
}
assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
sce.temp
})
# Add reduced dimensions
set.seed(220225)
X.list <- lapply(X.list, function(sce){
sce <- runUMAP(sce, exprs_values="exprs")
sce <- runTSNE(sce, exprs_values="exprs")
sce
})
# Save SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# lapply(1:(length(X.list)), function(i){saveRDS(X.list[i],
# paste0(gsub("\\..*", "", X.files[i]),
# "_processed.rds"))})
## Read masks
# Read in masks for tonsil data
masks.tonsil <- loadImages(file.path(tonsil.path, "steinbock/masks_deepcell"), as.is=TRUE)
mcols(masks.tonsil) <- DataFrame(sample_id=names(masks.tonsil))
# Read in masks for lung data
masks.lung <- loadImages(file.path(lung.path, "masks"), as.is=TRUE)
mcols(masks.lung) <- DataFrame(sample_id=names(masks.lung))
## Read original H5ad SCE objects
tonsil.original <- readH5AD(file.path(tonsil.path, "preprocessed_data/spe.h5ad"))
lung.original <- readH5AD(file.path(lung.path, "Lung_sce.h5ad"))
original.list <- list(tonsil.original, lung.original)
names(original.list) <- c("tonsil_tonsil", "lung_lung")
# Define celltype colours used by the rest of the script
celltype.col <- c(brewer.pal(12, "Paired"),
brewer.pal(8, "Pastel2")[-c(3,5,8)],
brewer.pal(12, "Set3")[-c(2,3,8,9,11,12)])
celltype.col <- c(celltype.col[1:length(unique(colData(lung.original)$celltype))],
"black")
names(celltype.col) <- c(as.character(unique(colData(lung.original)$celltype)), NA)
celltype.col <- celltype.col[colData(lung.original) %>%
as.data.frame() %>%
dplyr::pull(celltype) %>%
levels]
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(tonsil.original)
rm(lung.original)
# For each different measurement of training results, plot a barplot comparing
# diff. k-sparsities, simulation types and training-test set combinations
for (measure in names(res)[2:10]) {
cat("##### ", measure, "\n")
# Reorder dataframe for plotting
data_temp <- res %>%
dplyr::select(dataset, training, simulation, k, !!measure) %>%
mutate(group=paste(training, dataset, sep="_"))
# Create barplot
results.barplot <- plot_cisi_results(data_temp, "group", measure, "k")
print(results.barplot)
cat("\n\n")
}
cor <- plot_U(u, "k", "dataset")
plot_U_membership(u, "k", "dataset")
# Calculate mantel test between U's of tonsil and lung for all k's
mantel <- lapply(cor, function(l){
mantel_test(l[[1]], l[[2]])$mantel_r
}) %>%
as.data.frame(col.names=names(cor), check.names=FALSE)
knitr::kable(mantel, digits=2,
col.names=paste0("k=", names(mantel)))
| k=1 | k=3 |
|---|---|
| 0.66 | 0.51 |
# Subset to training and test of tonsil data
X.tonsil1 <- keep(X.list, function(x){
metadata(x)$training=="tonsil" & metadata(x)$dataset=="tonsil" & metadata(x)$k=="1"})
names(X.tonsil1) <- lapply(X.tonsil1,
function(x){metadata(x)$ground_truth})
# Call plot_cells to get individual plots for each roi for decomposed and true
# results
poi <- "SMA"
X.imgList <- plot_cells(X.tonsil1, masks.tonsil,
poi)
# Plot decomposed vs true results for test roi (002)
X.img <- plot_grid(plotlist=append(X.imgList[grepl("20220520_TsH_th152_cisi1_002",
names(X.imgList))],
X.imgList[grepl("legend",
names(X.imgList))]),
ncol=2, labels=names(X.tonsil1),
label_size=15, hjust=c(-2, -1.5),
vjust=1, scale=0.9)
X.img
# Rename list of tonsil data for nicer titles in plotting
X.tonsil1Renamed <- lapply(names(X.tonsil1), function(n){
rownames(X.tonsil1[[n]]) <- paste(rownames(X.tonsil1[[n]]),
n, sep="\n")
reducedDims(X.tonsil1[[n]]) <- NULL
X.tonsil1[[n]]
})
names(X.tonsil1Renamed) <- names(X.tonsil1)
# Add decomposed and true dataset of tonsil into one SCE
X.overlayed <- do.call("rbind", X.tonsil1Renamed)
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.tonsil1)
# Define protein of interest
pois <- c("CD20\nsimulated", "CD20\nground_truth")
# Plot cells coloured according to decomposed and true poi values
# (if similar in both should have a mixture of colours, e.g. orange)
X.imgDiff <- plotCells(mask=masks.tonsil[unique(colData(X.overlayed)$sample_id)],
object=X.overlayed,
cell_id="ObjectNumber", img_id="sample_id", colour_by=pois,
return_plot=TRUE, image_title=list(cex=1),
scale_bar=list(cex=1, lwidth=5),
legend=list(colour_by.title.cex=0.7, colour_by.labels.cex=0.7))
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.tonsil1Renamed)
# Draw plot
ggdraw(X.imgDiff$plot)
Plot of arcsinh transformed counts coloured by defined celltype.
# Subset to dataset trained and tested on the Immucan lung dataset
X.lung_lung <- keep(X.list, function(x){
metadata(x)$training=="lung" & metadata(x)$dataset=="lung"})
names(X.lung_lung) <- lapply(X.lung_lung,
function(x){metadata(x)$ground_truth})
k_s <- unique(unlist(lapply(X.lung_lung, function(sce_temp){metadata(sce_temp)$k})))
for (k in k_s) {
cat("##### k =", k, "\n")
# Subset for k of interest
X.lung_lungK <- keep(X.lung_lung, function(x){metadata(x)$k==k})
# Plot asinh transformed counts of proteins of interest of decomposed and true
# datasets coloured by different celltypes
X.Exprs <- plot_grid(plot_exprs(X.lung_lungK, "B", "CD3", "CD20"),
plot_exprs(X.lung_lungK, "BnT", "CD3", "CD20"),
plot_exprs(X.lung_lungK, "Neutro", "MPO", "CD15"),
plot_exprs(X.lung_lungK, "Neutro", "CD3", "MPO"),
plot_exprs(X.lung_lungK, "DC", "CD11c", "CD68"),
ncol=1)
print(X.Exprs)
cat("\n\n")
}
Scatterplot of ground truth vs decomposed results per protein for k=1 and asinh transformed counts.
# Add all X_test counts together into dataframe for easier plotting
aoi <- "exprs"
X.df <- lapply(X.list, function(sce){
counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
cell=variable) %>%
mutate(k=metadata(sce)$k,
dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
}) %>% bind_rows() %>%
group_by(protein, cell, k, dataset) %>%
summarise_all(na.omit)
# For each test/training dataset combination plot for each true vs decomposed
# results in scatter plot and add diagonal and regression line, as well as
# R (pearson correlation coefficient)
for (i in unique(X.df$dataset)) {
cat("#####", i, "\n")
proteinPlot <- ggscatter(X.df %>%
dplyr::filter(k=="1" & dataset==i),
x="ground_truth", y="simulated",
add="reg.line",
color=pal_npg("nrc")("1"),
add.params=list(color=pal_npg("nrc")("4")[4],
size=2)) +
facet_wrap(~ protein, scales="free", ncol=5) +
theme_cowplot(10) +
geom_abline(slope=1, linetype="dashed") +
stat_cor(size=2)
print(proteinPlot)
cat("\n\n")
}
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.df)
Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts.
aoi <- "exprs"
# Calculate correlations between ground truth and simulated data for each protein
X.cor <- lapply(keep(X.list, function(sce){
metadata(sce)$dataset==metadata(sce)$training
}), function(sce){
counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
cell=variable) %>%
mutate(k=metadata(sce)$k,
dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
}) %>% bind_rows() %>%
group_by(protein, cell, k, dataset) %>%
summarise_all(na.omit) %>%
group_by(k, dataset, protein) %>%
summarize(correlation=cor(ground_truth, simulated)) %>%
dplyr::filter(k=="1") %>%
dplyr::select(-k)
# Set seed for reproducibility when using Mclust
set.seed(220224)
# Read in original SCE with all samples and proteins and without paper normalization
# of counts/exprs and compute statistics like mean, median, var,... on these
# Add this to above correlation dataframe
aoi.original <- "exprs"
X.cor <- lapply(names(original.list), function(n){
sce <- original.list[[n]]
counts.long <- as.data.frame(assays(sce)[[aoi.original]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(ground_truth=value,
cell=variable) %>%
dplyr::filter(protein %in% unique(X.cor$protein)) %>%
group_by(protein) %>%
mutate(median=median(ground_truth),
mean=mean(ground_truth),
var=var(ground_truth),
dataset=n)
# Add signal to noise ratio and mean of positive class per protein
mat <- apply(assays(sce)[[aoi.original]], 1, function(x){
cur_model <- Mclust(x, G = 2)
mean1 <- mean(x[cur_model$classification == 1])
mean2 <- mean(x[cur_model$classification == 2])
signal <- ifelse(mean1 > mean2, mean1, mean2)
noise <- ifelse(mean1 > mean2, mean2, mean1)
return(c(snr=signal/noise, ps=signal))
})
counts.long <- counts.long %>%
merge(t(mat) %>%
as.data.frame() %>%
mutate(protein=colnames(mat)))
counts.long
}) %>% bind_rows() %>%
merge(X.cor) %>%
ungroup()
# Plot correlations for k=1 for each test/training dataset combination
for (i in unique(X.cor$dataset)) {
cat("#####", i, "\n")
proteinCorr <- plot_protein_cor(X.cor %>%
dplyr::filter(dataset==i))
print(proteinCorr)
cat("\n\n")
}
For a manual inspection of different protein distribution characteristics, we plot the boxplots of the original protein expressions coloured by correlation results. These show: Interquartile range (IQR), 1st and 3rd quantiles, median, mean, max, min and outliers.
# # Have a look at dittoRidgePlots of some good/bad performing proteins for lung and
# # tonsil
# tonsil.originalSubset <- tonsil.original
# colData(tonsil.originalSubset)$dummy <- "1"
# for (p in c("MPO", "Ki-67", "CD15", "CD8a", "CD45RO", "CD303", "PD-L1", "SMA",
# "TCF1/TCF7", "CD27", "CD14")){
# print(dittoRidgePlot(tonsil.originalSubset, var=p, group.by="dummy", assay="exprs") +
# ggtitle(p) +
# coord_cartesian(xlim=c(0, 6.2), expand=TRUE))
# }
# lung.originalSubset <- lung.original
# colData(lung.originalSubset)$dummy <- "1"
# for (p in c("MPO", "Ki-67", "CD20", "PD-L1", "PD-1", "CD14")){
# print(dittoRidgePlot(lung.originalSubset, var=p, group.by="dummy", assay="exprs") +
# ggtitle(p))
# }
# Plot boxplots of protein expr per dataset coloured by protein correlation results
# (performance)
proteinDist <- plot_protein_dist(X.cor)
print(proteinDist)
Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts compared to protein characteristics of ground truth: Median, var, mean and mode_pos.
# Remove indidvidual cell levels to make data frame smaller bc of storage
X.cor <- X.cor %>%
dplyr::select(-c("cell", "ground_truth")) %>%
ungroup() %>%
distinct()
# Define characteristics of interest
coi <- c("median", "var", "mean", "snr")
# Plot correlations for k=1 for each test/training dataset combination
for (i in coi) {
cat("#####", i, "\n")
proteinChar <- ggplot(X.cor,
aes(x=!!sym(i), y=correlation, label=protein)) +
geom_point(color=pal_npg("nrc")("1")) +
facet_wrap(~ dataset, scales="free", ncol=2) +
theme_cowplot(10) +
stat_cor(size=2.8,
label.x.npc="center",
label.y.npc="bottom") +
stat_smooth(method="lm",
formula=y ~ log(x),
se=FALSE,
color=pal_npg("nrc")("4")[4], size=2) +
geom_text_repel(size=2.8)
print(proteinChar)
cat("\n\n")
}
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.cor)
Another way to analyse the protein-wise results instead of the correlation is shown in the subsequent part. In this case, cells are ranked according to their expression in the ground truth data for each protein and we compute the cumulative sum. Then, we plot this against the cumulative sum of the simulated data where the cell order is determined by the expression values in the ground truth data. The results are shown for k=1.
# Collect all counts for each dataset, rank according to the counts and calculate
# the cumulative sum
# Calculate area between curves of ground_truth and simulated
cumsum.df <- lapply(X.list, function(sce){
counts.long <- as.data.frame(assays(sce)[["counts"]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
group_by(protein) %>%
dplyr::arrange(desc(value)) %>%
mutate(cumsum=cumsum(value),
x=(0:(length(cumsum)-1)),
cumsum=cumsum/max(cumsum)) %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
cell=variable,
!!as.symbol(paste(metadata(sce)$ground_truth, "cumsum", sep="_")):=cumsum,
!!as.symbol(paste(metadata(sce)$ground_truth, "x", sep="_")):=x) %>%
mutate(k=metadata(sce)$k,
dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
counts.long
}) %>% bind_rows() %>%
group_by(protein, cell, k, dataset) %>%
summarise_all(na.omit)
# Plot cumulative sums for k=1 for each test/training dataset combination
for (i in unique(cumsum.df$dataset)) {
cat("#####", i, "\n")
protein.cumsum <- ggplot(cumsum.df %>%
dplyr::filter(k=="1" & dataset==i)) +
geom_step(aes(x=simulated_x, y=simulated_cumsum, color="simulated")) +
geom_step(aes(x=ground_truth_x, y=ground_truth_cumsum, color="ground_truth")) +
facet_wrap(~protein, scales="free") +
theme_cowplot(10) +
ylab("Cumulative sum of counts") +
xlab("Ordered cells") +
scale_colour_manual("",
breaks=c("simulated", "ground_truth"),
values=c(simulated=pal_npg("nrc")("1"),
ground_truth=pal_npg("nrc")("2")[2]))
print(protein.cumsum)
cat("\n\n")
}
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(cumsum.df)
Below the UMAP and TSNE of the arcsinh counts for the Immucan lung dataset are shown using k=1. Red indicates a high expression of the particular protein.
# Plot UMAP and TSNE for k=1 and lung data
for (sce in keep(X.lung_lung, function(x){metadata(x)$k=="1"})){
cat("#####", metadata(sce)$training, "_", metadata(sce)$dataset, "\n")
# Create empty list for plots
X.redDimlist <- list()
# Extract for simulated and true results for UMAP and TSNE and plot and
# colour by celltype
for (method in c("UMAP", "TSNE")){
X.redDimlist <- append(X.redDimlist,
list(dittoDimPlot(sce,
var="celltype",
reduction.use=method,
size=0.2,
color.panel=celltype.col) +
theme_cowplot() +
theme(legend.position="bottom") +
guides(color=guide_legend(nrow=4,
byrow=TRUE,
override.aes=list(size=3)))))
}
# Plot simulated and true reduced dim on top of each other
X.redDimPlot <- plot_grid(plotlist=X.redDimlist,
ncol=2, labels=c("UMAP", "TSNE"),
label_size=15, hjust=c(-2, -1.5), vjust=1, scale=0.9)
print(X.redDimPlot)
cat("\n\n")
}
To investigate the clustering results of ground truth and simulated data a bit more, we show the overlap of clusters computed using Rphenograph (run with k=47 clusters) individually and compare which clusters overlap using a heatmap.
# Subset to data tested on lung
X.lung <- keep(X.list, function(x){
metadata(x)$dataset=="lung"})
names(X.lung) <- lapply(X.lung, function(sce){
paste(metadata(sce)$training, metadata(sce)$dataset,
metadata(sce)$k, metadata(sce)$ground_truth, sep="_")
})
# Compute clusters for each dataset using Rphenograph (for ground truth data,
# clusters are computed twice and an ARI score between them is calculated as a
# reference)
all.clusters <- lapply(X.lung,
function(sce){
mat <- t(assay(sce, "exprs"))
out1 <- Rphenograph(mat, k=100)
clusters <- as.vector(membership(out1[[2]])) %>%
as.data.frame() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=".") %>%
mutate(dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"),
k=metadata(sce)$k,
celltype=colData(sce)$celltype,
cell=colData(sce)$cell_id)
if (metadata(sce)$ground_truth=="simulated"){
out2 <- Rphenograph(mat, k=100)
max_ari <- ARI(factor(membership(out1[[2]])), factor(membership(out2[[2]])))
clusters <- clusters %>%
mutate(max_ari=max_ari)
}
clusters
}) %>% bind_rows() %>%
group_by(k, dataset, cell, celltype) %>%
summarise_all(na.omit) %>%
ungroup()
# Define functions to get overview of celltypes per cluster as row and column annotations
get_anno_clusters <- function(clusters, i, d, direction="reverse", which_sim="row"){
col_anno <- HeatmapAnnotation(celltypes=anno_barplot(table(paste("gt", clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(ground_truth)),
clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(celltype)),
gp=gpar(fill=celltype.col),
bar_width=1,
height=unit(5, "cm")),
show_annotation_name=FALSE)
row_anno <- HeatmapAnnotation(celltypes=anno_barplot(table(paste("sim", clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(simulated)),
clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(celltype)),
gp=gpar(fill=celltype.col),
bar_width=1,
width=unit(5, "cm"),
axis_param=list(direction=direction)),
show_annotation_name=FALSE,
which=which_sim)
list(col_anno, row_anno)
}
# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
cat("##### k =", i, "\n")
plot.new()
ht_list <- NULL
for (d in unique(all.clusters$dataset)){
# Get table of number of cells being in simulated cluster i and ground truth
# cluster j
temp.matrix <- table(paste("sim", all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(simulated)),
paste("gt", all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
pull(ground_truth))) %>%
as.matrix()
temp.matrix <- log10(temp.matrix + 10)
# Get barplot annotations of celltypes
anno.list <- get_anno_clusters(all.clusters, i, d)
col_anno <- anno.list[[1]]
row_anno <- anno.list[[2]]
# Create heatmap
cluster.overlap <- Heatmap(temp.matrix,
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=col_anno,
left_annotation=row_anno,
col=magma(100),
column_title=d,
row_names_gp=gpar(fontsize=8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1))
# Add heatmap to list
ht_list <- append(ht_list,
list(grid.grabExpr(draw(cluster.overlap,
heatmap_legend_list=list(
Legend(labels=names(celltype.col),
title="Celltypes",
legend_gp=gpar(fill=celltype.col)))))))
}
print(plot_grid(plotlist=ht_list, nrow=1))
cat("\n\n")
}
We also compare the clusters of the ground truth data to the clsuters of the simulated data using the Kullback-Leibner divergence of the celltypes.
# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
cat("##### k =", i, "\n")
plot.new()
ht_list.kl <- NULL
for (d in unique(all.clusters$dataset)){
clusters.filtered <- all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
dplyr::select(-c("k", "dataset", "max_ari"))
clusters_sim.kl <- table(clusters.filtered %>%
dplyr::pull(celltype),
paste0("sim_", clusters.filtered %>%
dplyr::pull(simulated))) %>%
scale(center=FALSE, scale=colSums(.))
clusters_gt.kl <- table(clusters.filtered %>%
dplyr::pull(celltype),
paste0("gt_", clusters.filtered %>%
dplyr::pull(ground_truth))) %>%
scale(center=FALSE, scale=colSums(.))
clusters.kl <- apply(clusters_sim.kl, 2, function(x){
apply(clusters_gt.kl, 2, function(y){
KLD(x, y)$sum.KLD.py.px})
})
# Get barplot annotations of celltypes
anno.list <- get_anno_clusters(all.clusters, i, d)
col_anno <- anno.list[[1]]
row_anno <- anno.list[[2]]
# Create heatmap
clusters.klPlot <- Heatmap(t(clusters.kl),
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=col_anno,
left_annotation=row_anno,
col=colorRamp2(c(0, 0.1, 5), rev(magma(3))), #rev(magma(100)),
column_title=d,
row_names_gp=gpar(fontsize=8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1))
# Add heatmap to list
ht_list.kl <- append(ht_list.kl,
list(grid.grabExpr(draw(clusters.klPlot))))
}
print(plot_grid(plotlist=ht_list.kl, nrow=1))
cat("\n\n")
}
Next, we assess clustering by looking at the median marker expression of each protein in each cluster and the compare it to the celltypes of each cluster. Ideally, each cluster has a median protein expression pattern that fits its main celltype.
for (i in unique(all.clusters$k)){
cat("##### k =", i, "\n")
plot.new()
ht_list.exprs <- NULL
for (d in unique(all.clusters$dataset)){
cluster_gt.exprs <- assay(X.lung[grepl(i, names(X.lung)) &
grepl(d, names(X.lung)) &
grepl("ground_truth", names(X.lung))][[1]], "exprs") %>%
as.data.frame() %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(cell=variable, expr=value)
cluster_gt.exprs <- all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
merge(cluster_gt.exprs) %>%
dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
cluster_gt.exprs <- cluster_gt.exprs %>%
mutate(ground_truth=paste("gt", ground_truth)) %>%
group_by(ground_truth, protein) %>%
summarise(mean_expr=mean(expr)) %>%
pivot_wider(names_from=ground_truth, values_from=mean_expr) %>%
column_to_rownames("protein") %>%
as.matrix
cluster_sim.exprs <- assay(X.lung[grepl(i, names(X.lung)) &
grepl(d, names(X.lung)) &
grepl("simulated", names(X.lung))][[1]], "exprs") %>%
as.data.frame() %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(cell=variable, expr=value)
cluster_sim.exprs <- all.clusters %>%
dplyr::filter(k==i & dataset==d) %>%
merge(cluster_sim.exprs) %>%
dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
cluster_sim.exprs <- cluster_sim.exprs %>%
mutate(simulated=paste("sim", simulated)) %>%
group_by(simulated, protein) %>%
summarise(mean_expr=mean(expr)) %>%
pivot_wider(names_from=simulated, values_from=mean_expr) %>%
column_to_rownames("protein") %>%
as.matrix
# Get barplot annotations of celltypes
anno.list <- get_anno_clusters(all.clusters, i, d, direction="normal", which_sim="column")
col_anno <- anno.list[[1]]
row_anno <- anno.list[[2]]
# Create heatmap
cluster.exprsPlotGt <- Heatmap(cluster_gt.exprs,
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=col_anno,
col=magma(100),
column_title="Ground truth",
row_names_gp=gpar(fontsize=8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1),
heatmap_legend_param=list(title="Ground truth"))
cluster.exprsPlotSim <- Heatmap(cluster_sim.exprs,
show_row_dend=FALSE,
show_column_dend=FALSE,
top_annotation=row_anno,
col=magma(100),
column_title="Simulated",
row_names_gp=gpar(fontsize=8),
column_names_gp=gpar(fontsize=8),
column_title_gp=gpar(fontsize=10),
rect_gp=gpar(col="white", lwd=1),
heatmap_legend_param=list(title="Simulated"))
# Add heatmap to list
ht_list.exprs <- append(ht_list.exprs,
list(grid.grabExpr(draw(cluster.exprsPlotGt + cluster.exprsPlotSim,
heatmap_legend_list=list(
Legend(labels=names(celltype.col),
title="Celltypes",
legend_gp=gpar(fill=celltype.col))),
column_title=d))))
}
print(plot_grid(plotlist=ht_list.exprs, nrow=1))
cat("\n\n")
}
Finally, we compute the adjusted random index (ARI) between the clustering of the ground truth and simulated data to assess the overlap.
# Add position for labels to dataframe for easier annotation in ggplot
all.clusters <- all.clusters %>%
ungroup() %>%
mutate(point.x=ifelse(k=="3", -0.25, 0.25) + as.numeric(as.factor(dataset)))
# Create barplot for ARI (cluster overlap)
ari.plot <- ggplot(all.clusters %>%
group_by(dataset, k) %>%
summarize(ARI=ARI(simulated, ground_truth),
point.x=point.x,
max_ari=max_ari)) +
geom_bar(aes(x=dataset, y=ARI, fill=k), position="dodge", stat="identity") +
geom_point(aes(x=point.x, y=max_ari)) +
theme_cowplot() +
scale_fill_npg() +
coord_flip() +
ylim(0, 1)
print(ari.plot)
Since the ARI seems to be pretty bad in all cases, we have a look at another metric regarding cluster overlap between ground truth and simulated clusters. For that, we find the closest cluster in the simulated data for each ground truth cluster and compute the Jaccard index of the two.
# Calculate celltype probabilites per cluster
cluster.jaccard <- all.clusters %>%
group_by(k, dataset, ground_truth, simulated) %>%
summarise(count=n()) %>%
group_by(k, dataset, ground_truth) %>%
dplyr::filter(count==max(count)) %>%
dplyr::select(-count) %>%
ungroup()
# Calculate jaccard index between clusters using celltype probabilities
cluster.jaccard$jaccard <- lapply(1:nrow(cluster.jaccard), function(i){
df.temp <- all.clusters %>%
dplyr::filter(k==cluster.jaccard$k[i] &
dataset==cluster.jaccard$dataset[i] &
(ground_truth==cluster.jaccard$ground_truth[i] |
simulated==cluster.jaccard$simulated[i])) %>%
mutate(ground_truth=ifelse(ground_truth==cluster.jaccard$ground_truth[i], 1, 0),
simulated=ifelse(simulated==cluster.jaccard$simulated[i], 1, 0))
jaccard(df.temp %>%
pull(ground_truth),
df.temp %>%
pull(simulated))
}) %>% unlist()
# Plot jaccard index distribution per dataset and k
for (i in unique(cluster.jaccard$k)) {
cat("##### k =", i, "\n")
# Create empty plot list to append all dataset jaccard distribution plots to
plot.list <- list()
for (d in unique(cluster.jaccard$dataset)){
# Subset to dataset and k of interest
temp.data <- cluster.jaccard %>%
dplyr::filter(k==i & dataset==d)
# Create plot
cluster.overlapPlot <- ggplot(temp.data,
aes(x=jaccard)) +
geom_density(fill=pal_npg("nrc")("1")) +
theme_cowplot(10) +
geom_vline(aes(xintercept=mean(jaccard)),
linetype="dashed") +
geom_text(aes(x=mean(jaccard), label=round(mean(jaccard), 2)),
y=Inf, vjust=2, hjust=-2) +
xlim(0, 1)
plot.list <- append(plot.list, list(cluster.overlapPlot))
}
# Print plot
print(plot_grid(plotlist=plot.list, nrow=1,
labels=unique(cluster.jaccard$dataset),
label_size=15, hjust=c(-2, -1.5), vjust=1))
cat("\n\n")
}